1 PROBLEMA: Predecir la diabetes según las medidas de diagnóstico

LIBRERÍAS A UTILIZAR EN EL PROBLEMA

library(naniar)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(fdth)
## 
## Attaching package: 'fdth'
## The following objects are masked from 'package:stats':
## 
##     sd, var
library(agricolae)
library(UsingR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer

1.1 LECTURA DE DATOS

# Read the data set. The file uses "," both as field separator and as decimal
# mark (values such as "3,9"), so dec = "," is required for chol_hdl_ratio,
# bmi and waist_hip_ratio to be parsed as numeric instead of character.
datas <- read.csv("diabetes.csv", header = TRUE, sep = ",", dec = ",",
                  fill = FALSE)
head(datas)
##   patient_number cholesterol glucose hdl_chol chol_hdl_ratio age gender height
## 1              1         193      77       49            3.9  19 female     61
## 2              2         146      79       41            3.6  19 female     60
## 3              3         217      75       54            4.0  20 female     67
## 4              4         226      97       70            3.2  20 female     64
## 5              5         164      91       67            2.4  20 female     70
## 6              6         170      69       64            2.7  20 female     64
##   weight  bmi systolic_bp diastolic_bp waist hip waist_hip_ratio    diabetes
## 1    119 22.5         118           70    32  38            0.84 No diabetes
## 2    135 26.4         108           58    33  40            0.83 No diabetes
## 3    187 29.3         110           72    40  45            0.89 No diabetes
## 4    114 19.6         122           64    31  39            0.79 No diabetes
## 5    141 20.2         122           86    32  39            0.82 No diabetes
## 6    161 27.6         108           70    37  40            0.93 No diabetes
tail(datas)
##     patient_number cholesterol glucose hdl_chol chol_hdl_ratio age gender
## 385            385         255     112       34            7.5  82   male
## 386            386         227     105       44            5.2  83 female
## 387            387         226     279       52            4.3  84 female
## 388            388         301      90      118            2.6  89 female
## 389            389         232     184      114            2.0  91 female
## 390            390         165      94       69            2.4  92 female
##     height weight  bmi systolic_bp diastolic_bp waist hip waist_hip_ratio
## 385     66    163 26.3         179           89    37  43            0.86
## 386     59    125 25.2         150           90    35  40            0.88
## 387     60    192 37.5         144           88    41  48            0.85
## 388     61    115 21.7         218           90    31  41            0.76
## 389     61    127 24.0         170           82    35  38            0.92
## 390     62    217 39.7         160           82    51  51            1.00
##        diabetes
## 385 No diabetes
## 386 No diabetes
## 387    Diabetes
## 388 No diabetes
## 389    Diabetes
## 390 No diabetes
# attach() puts the columns (e.g. `glucose`) on the search path; it is kept
# because the rest of the analysis refers to the columns by their bare names.
attach(datas)
# Percentage of missing values over the whole data frame.
pct_miss(datas)
## [1] 0

Un análisis inicial muestra que no existen datos faltantes por lo que no hace falta omitir o imputar.

1.2 TIPO DE VARIABLES

1.3 VARIABLES CUALITATIVAS

1.4 ANÁLISIS DE VARIABLES CUANTITATIVAS

1.4.1 TABLAS DE FRECUENCIAS Y DISTRIBUCIÓN

TABLA DE FRECUENCIAS SIMPLE

# Simple frequency table for glucose: one column per distinct value with its
# absolute frequency (row "fi") and relative frequency (row "pi"), plus a
# final "Totales" column.
tab_gluc <- table(glucose)
sum(tab_gluc)
## [1] 390
# NOTE: despite the "_abs" suffix, this vector holds the relative frequencies.
tab_gluc_abs <- prop.table(tab_gluc)
sum(tab_gluc_abs)
## [1] 1

# Append the grand total as the last element of each frequency vector.
tab_gluc <- c(tab_gluc, sum(tab_gluc))
tab_gluc_abs <- c(tab_gluc_abs, sum(tab_gluc_abs))

# Column labels: the observed glucose values; the appended total has an empty
# name, which is replaced by "Totales".
nums <- names(tab_gluc)
nums[length(nums)] <- "Totales"

# Stack both vectors as rows; rbind() names the rows directly.
tab_freq_gluc <- rbind(fi = tab_gluc, pi = tab_gluc_abs)
colnames(tab_freq_gluc) <- nums
length(tab_freq_gluc)
## [1] 234
tab_freq_gluc
##             48          52          54          56          57          58
## fi 1.000000000 1.000000000 1.000000000 2.000000000 1.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.005128205 0.002564103 0.002564103
##             59          60          62          64          65          66
## fi 1.000000000 1.000000000 1.000000000 2.000000000 2.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.005128205 0.005128205 0.002564103
##            67          68         69          70         71          72
## fi 4.00000000 2.000000000 4.00000000 3.000000000 6.00000000 1.000000000
## pi 0.01025641 0.005128205 0.01025641 0.007692308 0.01538462 0.002564103
##             73          74         75          76          77         78
## fi 2.000000000 10.00000000 8.00000000 10.00000000 11.00000000 5.00000000
## pi 0.005128205  0.02564103 0.02051282  0.02564103  0.02820513 0.01282051
##            79         80          81          82          83          84
## fi 7.00000000 6.00000000 15.00000000 10.00000000 11.00000000 12.00000000
## pi 0.01794872 0.01538462  0.03846154  0.02564103  0.02820513  0.03076923
##             85         86          87         88         89          90
## fi 18.00000000 7.00000000 12.00000000 9.00000000 6.00000000 10.00000000
## pi  0.04615385 0.01794872  0.03076923 0.02307692 0.01538462  0.02564103
##             91          92          93         94         95          96
## fi 10.00000000 14.00000000 3.000000000 8.00000000 7.00000000 2.000000000
## pi  0.02564103  0.03589744 0.007692308 0.02051282 0.01794872 0.005128205
##            97          98          99        100        101         102
## fi 7.00000000 3.000000000 2.000000000 5.00000000 8.00000000 3.000000000
## pi 0.01794872 0.007692308 0.005128205 0.01282051 0.02051282 0.007692308
##            103         104        105        106         107         108
## fi 3.000000000 2.000000000 5.00000000 5.00000000 1.000000000 2.000000000
## pi 0.007692308 0.005128205 0.01282051 0.01282051 0.002564103 0.005128205
##            109         110         111        112         113         115
## fi 3.000000000 2.000000000 3.000000000 5.00000000 2.000000000 3.000000000
## pi 0.007692308 0.005128205 0.007692308 0.01282051 0.005128205 0.007692308
##            117         118         119        120         121         122
## fi 1.000000000 3.000000000 3.000000000 5.00000000 2.000000000 2.000000000
## pi 0.002564103 0.007692308 0.007692308 0.01282051 0.005128205 0.005128205
##            124         125         126         128         130         131
## fi 1.000000000 1.000000000 2.000000000 2.000000000 2.000000000 1.000000000
## pi 0.002564103 0.002564103 0.005128205 0.005128205 0.005128205 0.002564103
##            133         138         145         153         155         161
## fi 1.000000000 1.000000000 1.000000000 1.000000000 3.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.002564103 0.007692308 0.002564103
##            171         172         173         174         176         177
## fi 1.000000000 1.000000000 3.000000000 1.000000000 1.000000000 1.000000000
## pi 0.002564103 0.002564103 0.007692308 0.002564103 0.002564103 0.002564103
##            182         184         185         187         193         196
## fi 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103
##            197         203         206         223         225         228
## fi 2.000000000 1.000000000 3.000000000 1.000000000 2.000000000 1.000000000
## pi 0.005128205 0.002564103 0.007692308 0.002564103 0.005128205 0.002564103
##            233         235         236         239         248         251
## fi 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103
##            255         262         267         269         270         279
## fi 1.000000000 1.000000000 1.000000000 1.000000000 2.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.002564103 0.005128205 0.002564103
##            297         299         330         341         342         369
## fi 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000 1.000000000
## pi 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103 0.002564103
##            371         385 Totales
## fi 1.000000000 1.000000000     390
## pi 0.002564103 0.002564103       1

TABLA DE FRECUENCIAS COMPLETA:

INTERVALO STURGES

# Grouped frequency distribution of glucose with the number of classes chosen
# by Sturges' rule and right-closed intervals.
tabla_completa <- fdt(glucose, breaks = "Sturges", right = TRUE)
tabla_completa
##       Class limits   f   rf rf(%)  cf  cf(%)
##     (47.52,81.653] 109 0.28 27.95 109  27.95
##   (81.653,115.786] 203 0.52 52.05 312  80.00
##  (115.786,149.919]  28 0.07  7.18 340  87.18
##  (149.919,184.052]  15 0.04  3.85 355  91.03
##  (184.052,218.185]  10 0.03  2.56 365  93.59
##  (218.185,252.318]  10 0.03  2.56 375  96.15
##  (252.318,286.451]   7 0.02  1.79 382  97.95
##  (286.451,320.584]   2 0.01  0.51 384  98.46
##  (320.584,354.717]   3 0.01  0.77 387  99.23
##   (354.717,388.85]   3 0.01  0.77 390 100.00

A partir de la tabla de frecuencias se observa que la mayor frecuencia está en la segunda clase, es decir, en el intervalo de 81.653 a 115.786; junto con la primera clase acumula el 80 % de los pacientes, lo cual indica que la mayoría de la población estudiada tiene un nivel de glucosa menor a 140 mg/dl, el cual es un nivel saludable.

1.4.2 ANÁLISIS GRÁFICO

1.4.2.1 HISTOGRAMA

# Density histogram of glucose using 30 mg/dl bins from 40 to 400, with a
# kernel density estimate drawn on top.
# freq = FALSE puts densities (not percentages) on the y axis, which is what
# the overlaid density() curve needs, so the axis is labelled accordingly.
hist_glu <- hist(glucose, breaks = seq(40, 400, 30), freq = FALSE,
                 col = heat.colors(12),
                 main = "Histograma de Glucosa",
                 xlab = "Nivel de Glucosa (mg/dl)", ylab = "Densidad")
lines(density(glucose), col = "purple", lwd = 3)

1.4.2.2 POLIGONO DE FRECUENCIAS

# Frequency polygon of glucose on the same 30 mg/dl bins as the histogram.
# The title is written on a single source line so that no line break or
# indentation ends up inside the plotted title.
# NOTE(review): col, lwd and breaks are presumably forwarded to the underlying
# histogram by simple.freqpoly() -- confirm against the UsingR documentation.
simple.freqpoly(glucose, col = heat.colors(12), lwd = 3,
                breaks = seq(40, 400, 30),
                main = "POLÍGONO DE FRECUENCIAS",
                xlab = "Nivel de Glucosa (mg/dl)", ylab = "# de Pacientes")

Las 2 primeras gráficas, Histograma y Polígono de Frecuencias respectivamente, confirman que la distribución es unimodal y que la mayoría de los pacientes presenta niveles de glucosa saludables.

1.4.2.3 DIAGRAMA DE CAJA Y BRAZOS

# Horizontal box-and-whisker plot of glucose.
# (The interactive help("plot_ly") lookup was removed: it only opens the help
# browser and does not belong in the rendered analysis.)
caja <- plot_ly(datas, x = ~glucose, type = "box",
                name = "Distribución Glucosa")
# plotly::layout is namespaced because graphics::layout has the same name.
plotly::layout(p = caja, title = "Caja y Brazos", colorway = "red",
               xaxis = list(title = "Glucosa MG/DL"))

El Diagrama de Caja igual confirma la unimodalidad de la distribución con la señalización de la mediana sesgada al lado Izquierdo, sin embargo también expone de manera clara la existencia de múltiples outliers distribuidos no de manera uniforme pero sí con cierta constancia en valores mayores a 150 mg/dl lo cual indica pacientes con alerta de prediabetes o diabetes tipo 2.

1.4.2.4 Medidas Descriptivas

1.- MODA

# Mode (most frequent value) of glucose.
# NOTE(review): mfv() is presumably the one provided by fdth -- confirm.
mfv(glucose)
## [1] 85

2.- MEDIANA

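Utilizaremos la fórmula de la mediana para datos agrupados: $Me = L_i + \frac{\frac{n}{2} - F_{i-1}}{f_i}\, c$, donde $L_i$ es el límite inferior de la clase mediana, $F_{i-1}$ la frecuencia acumulada hasta la clase anterior, $f_i$ la frecuencia de la clase mediana y $c$ la amplitud de clase.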

# Median of glucose: first the exact value from the raw data, then the
# grouped-data approximation li + ((n/2 - FA) / fmd) * c.
median(glucose)
## [1] 90
# The constants below are copied by hand from the second class
# (81.653, 115.786] of the Sturges frequency table, which is the median class;
# they must be updated manually if that table changes.
li <- 81.653            # lower limit of the median class
nn <- length(glucose)   # number of observations
FA <- 109               # cumulative frequency before the median class
fmd <- 203              # frequency of the median class
# Class width. The variable name `c` coincides with base::c (function calls to
# c() still work); the name is kept so code that reads it is not affected.
c <- 115.786 - 81.653

print("La mediana obtenida por la fórmula es:")
## [1] "La mediana obtenida por la fórmula es:"
mediana <- li + (((nn / 2) - FA) / fmd) * c
mediana
## [1] 96.11329

3.- MEDIA

# Arithmetic mean of glucose.
mean(glucose)
## [1] 107.3385

4.- COEFICIENTE DE ASIMETRÍA

# Skewness coefficient of glucose (a positive value means a longer right tail).
# NOTE(review): skewness() presumably comes from agricolae -- confirm which
# attached package provides it.
skewness(glucose)
## [1] 2.711121

ASIMETRÍA POSITIVA: el Coeficiente de Asimetría es mayor a 0, por lo que los datos tienden a acumularse del lado izquierdo de la distribución, con una cola larga hacia la derecha.

5.- CURTOSIS

# Kurtosis of glucose.
# NOTE(review): kurtosis() presumably comes from agricolae and, if so, would
# report excess kurtosis -- confirm before comparing against 0 or 3.
kurtosis(glucose)
## [1] 7.905913

La Curtosis indica una curva LEPTOCÚRTICA(MUY PICUDA)

6.- COEFICIENTE DE VARIACIÓN

# Coefficient of variation: standard deviation relative to the mean.
# Per the package load messages, sd() here is the fdth version that masks
# stats::sd.
sd(glucose)/mean(glucose)
## [1] 0.5012014

1.4.3 OJIVA

Para unir la gráfica interactiva y los puntos a analizar se realizó la Ojiva a partir de la tabla de frecuencias con los intervalos de clase a partir de los percentiles cada 5%, posteriormente se obtienen los valores de cada percentil solicitado (0.15, 0.60, 0.95) y finalmente se colocó una sombra circular sobre el área que lo indica.

CUANTILES

# 15th, 60th and 95th percentiles of glucose; these are the points the text
# says are highlighted on the ogive.
quantile(glucose, probs=c(0.15,0.6, 0.95))
##   15%   60%   95% 
##  76.0  94.0 234.1

OJIVA

# Build the table behind the ogive: glucose is binned using its own quantiles
# in 5% steps as class limits (hist() is only used for counting, not drawn).
pru <- hist(glucose, breaks = quantile(glucose, probs = seq(0, 1, 0.05)),
            plot = FALSE)

# NOTE: this overwrites the `hist_glu` object created for the histogram plot.
hist_glu <- pru
n1 <- length(hist_glu$breaks)
# Columns, in order: lower limit, upper limit, count, relative frequency,
# cumulative count, cumulative relative frequency.
# The arguments are kept as expressions (not named variables) so cbind() leaves
# the columns unnamed; row labels come from the names of the quantile breaks.
tab_glu_oji <- cbind(hist_glu$breaks[-n1], hist_glu$breaks[-1],
                     hist_glu$counts,
                     hist_glu$counts / sum(hist_glu$counts),
                     cumsum(hist_glu$counts),
                     cumsum(hist_glu$counts / sum(hist_glu$counts)))
tab_glu_oji
##       [,1]   [,2] [,3]       [,4] [,5]       [,6]
## 0%   48.00  68.00   21 0.05384615   21 0.05384615
## 5%   68.00  74.00   26 0.06666667   47 0.12051282
## 10%  74.00  76.00   18 0.04615385   65 0.16666667
## 15%  76.00  78.00   16 0.04102564   81 0.20769231
## 20%  78.00  81.00   28 0.07179487  109 0.27948718
## 25%  81.00  82.00   10 0.02564103  119 0.30512821
## 30%  82.00  84.00   23 0.05897436  142 0.36410256
## 35%  84.00  85.00   18 0.04615385  160 0.41025641
## 40%  85.00  87.00   19 0.04871795  179 0.45897436
## 45%  87.00  90.00   25 0.06410256  204 0.52307692
## 50%  90.00  91.95   10 0.02564103  214 0.54871795
## 55%  91.95  94.00   25 0.06410256  239 0.61282051
## 60%  94.00  97.00   16 0.04102564  255 0.65384615
## 65%  97.00 101.30   18 0.04615385  273 0.70000000
## 70% 101.30 107.75   19 0.04871795  292 0.74871795
## 75% 107.75 115.40   20 0.05128205  312 0.80000000
## 80% 115.40 126.00   20 0.05128205  332 0.85128205
## 85% 126.00 174.20   19 0.04871795  351 0.90000000
## 90% 174.20 234.10   19 0.04871795  370 0.94871795
## 95% 234.10 385.00   20 0.05128205  390 1.00000000
# Label the ogive-table columns: class limits, absolute/relative frequency and
# their cumulative versions.
colnames(tab_glu_oji) <- c("Linf", "Lsup", "f", "fr", "F", "Fr")

# Points of the ogive: class upper limit vs. cumulative relative frequency.
h1 <- data.frame(Lsup = tab_glu_oji[, "Lsup"], Fr = tab_glu_oji[, "Fr"])
# mode = "lines+markers" is stated explicitly: a `marker` spec together with
# mode = "lines" makes plotly emit a warning and add the markers anyway.
ojiva <- plot_ly(h1, x = ~Lsup, y = ~Fr,
                 marker = list(size = 15, color = "purple"),
                 type = "scatter", mode = "lines+markers")
# The three circles are placed by hand around the 15%, 60% and 95% percentiles
# (76, 94 and 234.1 mg/dl); they must be moved manually if the data change.
plotly::layout(
  p = ojiva, title = "Ojiva(Glucosa)",
  xaxis = list(title = "Límite Superior"),
  yaxis = list(title = "%"),
  shapes = list(
    # vertical line
    list(type = "line", x0 = 0, x1 = 0, y0 = 0, y1 = 1, yref = "paper"),
    # horizontal line
    list(type = "line", x0 = 0, x1 = 400, y0 = 1, y1 = 1, yref = "paper"),
    # 15th percentile
    list(type = 'circle',
         xref = 'x', x0 = 70, x1 = 80,
         yref = 'y', y0 = 0.12, y1 = 0.18,
         fillcolor = 'rgb(50, 20, 90)',
         line = list(color = 'rgb(50, 20, 90)'),
         opacity = 0.7),
    # 60th percentile
    list(type = 'circle',
         xref = 'x', x0 = 89, x1 = 99,
         yref = 'y', y0 = 0.56, y1 = 0.63,
         fillcolor = 'rgb(30, 100, 120)',
         line = list(color = 'rgb(30, 100, 120)'),
         opacity = 0.7),
    # 95th percentile
    list(type = 'circle', name = "Percentil 95%",
         xref = 'x', x0 = 230, x1 = 240,
         yref = 'y', y0 = 0.91, y1 = 0.97,
         fillcolor = 'rgb(90, 200, 75)',
         line = list(color = 'rgb(90, 200, 75)'),
         opacity = 0.7)))